{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import gensim\n",
    "from gensim import corpora, models\n",
    "from gensim.utils import simple_preprocess\n",
    "from gensim.parsing.preprocessing import STOPWORDS\n",
    "from nltk.stem import WordNetLemmatizer, SnowballStemmer\n",
    "from nltk.stem.porter import *\n",
    "from pprint import pprint\n",
    "import numpy as np\n",
    "import nltk\n",
    "nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据下载地址:https://www.kaggle.com/therohk/million-headlines/data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "# Load the ABC News headlines CSV (the wordnet corpus is already downloaded in the\n",
    "# import cell, so it is not fetched again here).\n",
    "# error_bad_lines=False makes pandas drop malformed rows instead of raising.\n",
    "# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0 in favour\n",
    "# of on_bad_lines='skip'; switch once the environment's pandas version allows it.\n",
    "data = pd.read_csv('./data/abcnews-date-text.csv', error_bad_lines=False)\n",
    "\n",
    "# Take an explicit copy of the single text column so that adding the 'index' column\n",
    "# below writes to an independent frame rather than to a slice of `data`\n",
    "# (avoids pandas' SettingWithCopyWarning).\n",
    "data_text = data[['headline_text']].copy()\n",
    "data_text['index'] = data_text.index\n",
    "documents = data_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1103663\n",
      "                                       headline_text  index\n",
      "0  aba decides against community broadcasting lic...      0\n",
      "1     act fire witnesses must be aware of defamation      1\n",
      "2     a g calls for infrastructure protection summit      2\n",
      "3           air nz staff in aust strike for pay rise      3\n",
      "4      air nz strike to affect australian travellers      4\n"
     ]
    }
   ],
   "source": [
    "# Row count, followed by a preview of the first five headlines.\n",
    "print(documents.shape[0])\n",
    "print(documents.head(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lemmatizer Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "go\n"
     ]
    }
   ],
   "source": [
    "# Lemmatize an irregular past-tense form; pos='v' asks WordNet for the verb lemma.\n",
    "went_lemma = WordNetLemmatizer().lemmatize('went', pos='v')\n",
    "print(went_lemma)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stemmer Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>original word</th>\n",
       "      <th>stemmed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>caresses</td>\n",
       "      <td>caress</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>flies</td>\n",
       "      <td>fli</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>dies</td>\n",
       "      <td>die</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>mules</td>\n",
       "      <td>mule</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>denied</td>\n",
       "      <td>deni</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>died</td>\n",
       "      <td>die</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>agreed</td>\n",
       "      <td>agre</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>owned</td>\n",
       "      <td>own</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>humbled</td>\n",
       "      <td>humbl</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>sized</td>\n",
       "      <td>size</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>meeting</td>\n",
       "      <td>meet</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>stating</td>\n",
       "      <td>state</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>siezing</td>\n",
       "      <td>siez</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>itemization</td>\n",
       "      <td>item</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>sensational</td>\n",
       "      <td>sensat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>traditional</td>\n",
       "      <td>tradit</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>reference</td>\n",
       "      <td>refer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>colonizer</td>\n",
       "      <td>colon</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>plotted</td>\n",
       "      <td>plot</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   original word stemmed\n",
       "0       caresses  caress\n",
       "1          flies     fli\n",
       "2           dies     die\n",
       "3          mules    mule\n",
       "4         denied    deni\n",
       "5           died     die\n",
       "6         agreed    agre\n",
       "7          owned     own\n",
       "8        humbled   humbl\n",
       "9          sized    size\n",
       "10       meeting    meet\n",
       "11       stating   state\n",
       "12       siezing    siez\n",
       "13   itemization    item\n",
       "14   sensational  sensat\n",
       "15   traditional  tradit\n",
       "16     reference   refer\n",
       "17     colonizer   colon\n",
       "18       plotted    plot"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Demonstrate English Snowball stemming on a handful of inflected words.\n",
    "stemmer = SnowballStemmer('english')\n",
    "original_words = [\n",
    "    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',\n",
    "    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',\n",
    "    'traditional', 'reference', 'colonizer', 'plotted',\n",
    "]\n",
    "singles = list(map(stemmer.stem, original_words))\n",
    "# Side-by-side table of each word and its stem (displayed as the cell output).\n",
    "pd.DataFrame({'original word': original_words, 'stemmed': singles})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "stemmer = SnowballStemmer('english')\n",
    "# Created once and reused: the lemmatizer does not depend on the token, so building a\n",
    "# new WordNetLemmatizer inside the function for every token was wasted work.\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "\n",
    "def lemmatize_stemming(text):\n",
    "    \"\"\"Return the Snowball stem of the verb lemma of `text`.\n",
    "\n",
    "    The input is first lemmatized with WordNet as a verb (pos='v'); the resulting\n",
    "    lemma is then stemmed with the module-level English Snowball stemmer.\n",
    "\n",
    "    Args:\n",
    "        text: the word to normalize (preprocess() passes one token at a time).\n",
    "\n",
    "    Returns:\n",
    "        The stemmed lemma.\n",
    "    \"\"\"\n",
    "    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))\n",
    "\n",
    "def preprocess(text):\n",
    "    \"\"\"Tokenize a document and return its normalized tokens.\n",
    "\n",
    "    Tokens come from gensim's simple_preprocess. A token is kept only when it is\n",
    "    longer than 3 characters and is not in gensim's STOPWORDS collection; every\n",
    "    kept token is passed through lemmatize_stemming.\n",
    "\n",
    "    Args:\n",
    "        text: raw document text.\n",
    "\n",
    "    Returns:\n",
    "        List of processed tokens, in the order they were produced by the tokenizer.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        lemmatize_stemming(token)\n",
    "        for token in simple_preprocess(text)\n",
    "        if len(token) > 3 and token not in STOPWORDS\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "original document: \n",
      "['more', 'women', 'urged', 'to', 'become', 'councillors']\n",
      "\n",
      "\n",
      " tokenized and lemmatized document: \n",
      "['women', 'urg', 'councillor']\n"
     ]
    }
   ],
   "source": [
    "# Fetch the headline whose 'index' column equals 100 (first column of the first match).\n",
    "sample_rows = documents[documents['index'] == 100]\n",
    "doc_sample = sample_rows.values[0][0]\n",
    "\n",
    "print('original document: ')\n",
    "# Splitting on a single space already yields the list of raw words.\n",
    "words = doc_sample.split(' ')\n",
    "print(words)\n",
    "print('\\n\\n tokenized and lemmatized document: ')\n",
    "print(preprocess(doc_sample))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0            [decid, communiti, broadcast, licenc]\n",
       "1                               [wit, awar, defam]\n",
       "2           [call, infrastructur, protect, summit]\n",
       "3                      [staff, aust, strike, rise]\n",
       "4             [strike, affect, australian, travel]\n",
       "5               [ambiti, olsson, win, tripl, jump]\n",
       "6           [antic, delight, record, break, barca]\n",
       "7    [aussi, qualifi, stosur, wast, memphi, match]\n",
       "8            [aust, address, secur, council, iraq]\n",
       "9                         [australia, lock, timet]\n",
       "Name: headline_text, dtype: object"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Apply preprocess() to every headline; the result is a Series of token lists.\n",
    "processed_docs = documents['headline_text'].map(preprocess)\n",
    "processed_docs.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bag of words on the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 decid\n",
      "1 communiti\n",
      "2 licenc\n",
      "3 broadcast\n",
      "4 awar\n",
      "5 defam\n",
      "6 wit\n",
      "7 call\n",
      "8 summit\n",
      "9 infrastructur\n",
      "number of total words: 62240\n"
     ]
    }
   ],
   "source": [
    "# Build the token <-> integer id mapping from all processed headlines.\n",
    "dictionary = corpora.Dictionary(processed_docs)\n",
    "# Peek at the tokens assigned to the first ten ids.\n",
    "for token_id in range(10):\n",
    "    print(token_id, dictionary[token_id])\n",
    "print('number of total words:', len(dictionary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 1319\n",
      "1 5640\n",
      "2 1181\n",
      "3 375\n",
      "4 543\n",
      "5 372\n",
      "6 1705\n",
      "7 9558\n",
      "8 1041\n",
      "9 1060\n"
     ]
    }
   ],
   "source": [
    "# dictionary.dfs maps token id -> number of documents containing that token.\n",
    "for token_id in range(10):\n",
    "    print(token_id, dictionary.dfs[token_id])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of total words: 14142\n"
     ]
    }
   ],
   "source": [
    "# Prune the vocabulary in place: drop tokens that occur in fewer than 15 documents or in\n",
    "# more than half of all documents, then keep at most the 100000 most frequent survivors.\n",
    "dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)\n",
    "print('number of total words:', len(dictionary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(2963, 1), (3393, 1), (4956, 1), (10226, 1)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert every processed headline into its sparse (token_id, count) representation.\n",
    "bow_corpus = list(map(dictionary.doc2bow, processed_docs))\n",
    "bow_corpus[4310]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['decid', 'communiti', 'broadcast', 'licenc']\n",
      "[(5196, 1), (5568, 1), (6334, 1), (8141, 1)]\n"
     ]
    }
   ],
   "source": [
    "# bow_corpus is built in the preceding cell; recomputing it here repeated a full pass\n",
    "# over every headline for an identical result, so the existing corpus is reused.\n",
    "print(processed_docs[0])\n",
    "print(bow_corpus[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Word 5196 (\"decid\") appears 1 time.\n",
      "Word 5568 (\"communiti\") appears 1 time.\n",
      "Word 6334 (\"licenc\") appears 1 time.\n",
      "Word 8141 (\"broadcast\") appears 1 time.\n"
     ]
    }
   ],
   "source": [
    "# Spell out the first document's bag-of-words entries: id, token text and count.\n",
    "bow_doc_0 = bow_corpus[0]\n",
    "for token_id, count in bow_doc_0:\n",
    "    print(\"Word {} (\\\"{}\\\") appears {} time.\".format(token_id, dictionary[token_id], count))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(5196, 0.4964985175717023),\n",
      " (5568, 0.38929654337861147),\n",
      " (6334, 0.5046520327464028),\n",
      " (8141, 0.5892908867507543)]\n"
     ]
    }
   ],
   "source": [
    "# Fit TF-IDF weights on the bag-of-words corpus, then apply them to the whole corpus.\n",
    "tfidf = models.TfidfModel(bow_corpus)\n",
    "corpus_tfidf = tfidf[bow_corpus]\n",
    "# Show only the first transformed document.\n",
    "pprint(next(iter(corpus_tfidf)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Running LDA using Bag of Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic: 0 \n",
      "Words: 0.016*\"council\" + 0.014*\"plan\" + 0.013*\"tasmanian\" + 0.012*\"turnbul\" + 0.011*\"work\" + 0.011*\"servic\" + 0.010*\"say\" + 0.009*\"park\" + 0.009*\"govern\" + 0.008*\"green\"\n",
      "Topic: 1 \n",
      "Words: 0.026*\"year\" + 0.019*\"market\" + 0.017*\"live\" + 0.017*\"australian\" + 0.015*\"rise\" + 0.013*\"price\" + 0.013*\"share\" + 0.013*\"farmer\" + 0.012*\"busi\" + 0.011*\"bank\"\n",
      "Topic: 2 \n",
      "Words: 0.022*\"govern\" + 0.017*\"elect\" + 0.017*\"interview\" + 0.014*\"say\" + 0.013*\"labor\" + 0.013*\"school\" + 0.012*\"leagu\" + 0.012*\"fund\" + 0.012*\"feder\" + 0.011*\"budget\"\n",
      "Topic: 3 \n",
      "Words: 0.029*\"melbourn\" + 0.011*\"hold\" + 0.011*\"game\" + 0.009*\"turn\" + 0.008*\"go\" + 0.008*\"video\" + 0.007*\"care\" + 0.007*\"climat\" + 0.007*\"season\" + 0.007*\"star\"\n",
      "Topic: 4 \n",
      "Words: 0.022*\"nation\" + 0.021*\"report\" + 0.016*\"rural\" + 0.013*\"health\" + 0.012*\"concern\" + 0.011*\"chang\" + 0.010*\"senat\" + 0.009*\"find\" + 0.009*\"parti\" + 0.009*\"liber\"\n",
      "Topic: 5 \n",
      "Words: 0.049*\"australia\" + 0.020*\"south\" + 0.019*\"north\" + 0.018*\"coast\" + 0.015*\"final\" + 0.014*\"china\" + 0.014*\"miss\" + 0.014*\"world\" + 0.013*\"hour\" + 0.013*\"west\"\n",
      "Topic: 6 \n",
      "Words: 0.033*\"queensland\" + 0.027*\"adelaid\" + 0.024*\"home\" + 0.023*\"open\" + 0.018*\"donald\" + 0.018*\"win\" + 0.015*\"break\" + 0.014*\"take\" + 0.012*\"lead\" + 0.011*\"hobart\"\n",
      "Topic: 7 \n",
      "Words: 0.020*\"test\" + 0.019*\"countri\" + 0.016*\"attack\" + 0.014*\"claim\" + 0.013*\"kill\" + 0.013*\"protest\" + 0.012*\"say\" + 0.010*\"island\" + 0.010*\"announc\" + 0.010*\"minist\"\n",
      "Topic: 8 \n",
      "Words: 0.054*\"polic\" + 0.031*\"sydney\" + 0.020*\"crash\" + 0.020*\"perth\" + 0.018*\"die\" + 0.016*\"shoot\" + 0.014*\"death\" + 0.013*\"woman\" + 0.013*\"investig\" + 0.012*\"dead\"\n",
      "Topic: 9 \n",
      "Words: 0.033*\"trump\" + 0.031*\"charg\" + 0.030*\"court\" + 0.023*\"murder\" + 0.019*\"face\" + 0.016*\"jail\" + 0.015*\"accus\" + 0.015*\"tasmania\" + 0.014*\"alleg\" + 0.014*\"child\"\n"
     ]
    }
   ],
   "source": [
    "# Train a 10-topic LDA model on the bag-of-words corpus (2 passes, 2 workers).\n",
    "# NOTE(review): no random_state is passed, so the learned topics can differ between runs.\n",
    "lda_model = models.LdaMulticore(\n",
    "    bow_corpus,\n",
    "    num_topics=10,\n",
    "    id2word=dictionary,\n",
    "    passes=2,\n",
    "    workers=2\n",
    ")\n",
    "# print_topics(-1) returns every topic as an (id, formatted top-words string) pair.\n",
    "for topic_id, topic_terms in lda_model.print_topics(-1):\n",
    "    print('Topic: {} \\nWords: {}'.format(topic_id, topic_terms))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Running LDA using TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic: 0 Word: 0.010*\"govern\" + 0.008*\"health\" + 0.007*\"rural\" + 0.007*\"fund\" + 0.006*\"drum\" + 0.006*\"council\" + 0.006*\"plan\" + 0.005*\"budget\" + 0.005*\"say\" + 0.005*\"chang\"\n",
      "Topic: 1 Word: 0.017*\"polic\" + 0.017*\"charg\" + 0.014*\"murder\" + 0.013*\"crash\" + 0.012*\"woman\" + 0.010*\"court\" + 0.010*\"alleg\" + 0.010*\"jail\" + 0.010*\"death\" + 0.009*\"shoot\"\n",
      "Topic: 2 Word: 0.010*\"leagu\" + 0.008*\"hill\" + 0.008*\"australia\" + 0.007*\"world\" + 0.006*\"rugbi\" + 0.006*\"octob\" + 0.006*\"novemb\" + 0.006*\"john\" + 0.006*\"peter\" + 0.006*\"tuesday\"\n",
      "Topic: 3 Word: 0.014*\"interview\" + 0.007*\"final\" + 0.007*\"open\" + 0.007*\"septemb\" + 0.007*\"monday\" + 0.006*\"thursday\" + 0.006*\"climat\" + 0.005*\"miss\" + 0.005*\"april\" + 0.005*\"shark\"\n",
      "Topic: 4 Word: 0.007*\"juli\" + 0.007*\"australia\" + 0.006*\"south\" + 0.006*\"korea\" + 0.006*\"june\" + 0.005*\"fiji\" + 0.005*\"china\" + 0.004*\"north\" + 0.004*\"russia\" + 0.004*\"protest\"\n",
      "Topic: 5 Word: 0.007*\"asylum\" + 0.006*\"seeker\" + 0.006*\"island\" + 0.006*\"dairi\" + 0.004*\"firefight\" + 0.004*\"energi\" + 0.004*\"creek\" + 0.004*\"volunt\" + 0.004*\"ship\" + 0.004*\"interview\"\n",
      "Topic: 6 Word: 0.017*\"news\" + 0.008*\"marriag\" + 0.008*\"michael\" + 0.007*\"david\" + 0.006*\"toni\" + 0.006*\"rural\" + 0.006*\"mount\" + 0.006*\"busi\" + 0.006*\"jam\" + 0.006*\"nation\"\n",
      "Topic: 7 Word: 0.024*\"trump\" + 0.010*\"grandstand\" + 0.006*\"histori\" + 0.006*\"capit\" + 0.006*\"quiz\" + 0.005*\"spring\" + 0.005*\"memori\" + 0.004*\"bendigo\" + 0.004*\"chris\" + 0.004*\"drone\"\n",
      "Topic: 8 Word: 0.024*\"countri\" + 0.022*\"hour\" + 0.011*\"market\" + 0.011*\"turnbul\" + 0.008*\"weather\" + 0.008*\"share\" + 0.007*\"live\" + 0.006*\"price\" + 0.005*\"kid\" + 0.005*\"wall\"\n",
      "Topic: 9 Word: 0.014*\"coast\" + 0.010*\"podcast\" + 0.009*\"donald\" + 0.009*\"queensland\" + 0.008*\"gold\" + 0.008*\"north\" + 0.007*\"sport\" + 0.006*\"west\" + 0.006*\"rain\" + 0.006*\"south\"\n"
     ]
    }
   ],
   "source": [
    "# Train a second 10-topic LDA model, this time on the TF-IDF weighted corpus\n",
    "# (2 passes, 4 workers).\n",
    "# NOTE(review): no random_state is passed, so the learned topics can differ between runs.\n",
    "lda_model_tfidf = models.LdaMulticore(\n",
    "    corpus_tfidf,\n",
    "    num_topics=10,\n",
    "    id2word=dictionary,\n",
    "    passes=2,\n",
    "    workers=4\n",
    ")\n",
    "\n",
    "for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):\n",
    "    print('Topic: {} Word: {}'.format(topic_id, topic_terms))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Performance evaluation by classifying sample document using LDA Bag of Words model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "more women urged to become councillors\n",
      "['women', 'urg', 'councillor']\n",
      "-------------------------------------\n",
      "\n",
      "Score: 0.7749839460818331\t \n",
      "Topic0: 0.016*\"council\" + 0.014*\"plan\" + 0.013*\"tasmanian\" + 0.012*\"turnbul\" + 0.011*\"work\" + 0.011*\"servic\" + 0.010*\"say\" + 0.009*\"park\" + 0.009*\"govern\" + 0.008*\"green\"\n",
      "\n",
      "Score: 0.025004759463502946\t \n",
      "Topic6: 0.033*\"queensland\" + 0.027*\"adelaid\" + 0.024*\"home\" + 0.023*\"open\" + 0.018*\"donald\" + 0.018*\"win\" + 0.015*\"break\" + 0.014*\"take\" + 0.012*\"lead\" + 0.011*\"hobart\"\n",
      "\n",
      "Score: 0.025003833795233864\t \n",
      "Topic4: 0.022*\"nation\" + 0.021*\"report\" + 0.016*\"rural\" + 0.013*\"health\" + 0.012*\"concern\" + 0.011*\"chang\" + 0.010*\"senat\" + 0.009*\"find\" + 0.009*\"parti\" + 0.009*\"liber\"\n",
      "\n",
      "Score: 0.02500318640073601\t \n",
      "Topic7: 0.020*\"test\" + 0.019*\"countri\" + 0.016*\"attack\" + 0.014*\"claim\" + 0.013*\"kill\" + 0.013*\"protest\" + 0.012*\"say\" + 0.010*\"island\" + 0.010*\"announc\" + 0.010*\"minist\"\n",
      "\n",
      "Score: 0.025001921876355325\t \n",
      "Topic2: 0.022*\"govern\" + 0.017*\"elect\" + 0.017*\"interview\" + 0.014*\"say\" + 0.013*\"labor\" + 0.013*\"school\" + 0.012*\"leagu\" + 0.012*\"fund\" + 0.012*\"feder\" + 0.011*\"budget\"\n",
      "\n",
      "Score: 0.025001562786080105\t \n",
      "Topic5: 0.049*\"australia\" + 0.020*\"south\" + 0.019*\"north\" + 0.018*\"coast\" + 0.015*\"final\" + 0.014*\"china\" + 0.014*\"miss\" + 0.014*\"world\" + 0.013*\"hour\" + 0.013*\"west\"\n",
      "\n",
      "Score: 0.025000658849745623\t \n",
      "Topic1: 0.026*\"year\" + 0.019*\"market\" + 0.017*\"live\" + 0.017*\"australian\" + 0.015*\"rise\" + 0.013*\"price\" + 0.013*\"share\" + 0.013*\"farmer\" + 0.012*\"busi\" + 0.011*\"bank\"\n",
      "\n",
      "Score: 0.02500013074618247\t \n",
      "Topic3: 0.029*\"melbourn\" + 0.011*\"hold\" + 0.011*\"game\" + 0.009*\"turn\" + 0.008*\"go\" + 0.008*\"video\" + 0.007*\"care\" + 0.007*\"climat\" + 0.007*\"season\" + 0.007*\"star\"\n",
      "\n",
      "Score: 0.025000000000167596\t \n",
      "Topic8: 0.054*\"polic\" + 0.031*\"sydney\" + 0.020*\"crash\" + 0.020*\"perth\" + 0.018*\"die\" + 0.016*\"shoot\" + 0.014*\"death\" + 0.013*\"woman\" + 0.013*\"investig\" + 0.012*\"dead\"\n",
      "\n",
      "Score: 0.02500000000016317\t \n",
      "Topic9: 0.033*\"trump\" + 0.031*\"charg\" + 0.030*\"court\" + 0.023*\"murder\" + 0.019*\"face\" + 0.016*\"jail\" + 0.015*\"accus\" + 0.015*\"tasmania\" + 0.014*\"alleg\" + 0.014*\"child\"\n"
     ]
    }
   ],
   "source": [
    "docId = 100\n",
    "print(documents['headline_text'][docId])\n",
    "print(processed_docs[docId])\n",
    "print('-------------------------------------')\n",
    "# Topic distribution of the sample document under the bag-of-words model, best first.\n",
    "ranked_topics = sorted(lda_model[bow_corpus[docId]], key=lambda pair: pair[1], reverse=True)\n",
    "for index, score in ranked_topics:\n",
    "    print(\"\\nScore: {}\\t \\nTopic{}: {}\".format(score, index, lda_model.print_topic(index, 10)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Performance evaluation by classifying sample document using LDA TF-IDF model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "more women urged to become councillors\n",
      "['women', 'urg', 'councillor']\n",
      "-------------------------------------\n",
      "\n",
      "Score: 0.7749672025110785\t \n",
      "Topic0: 0.010*\"govern\" + 0.008*\"health\" + 0.007*\"rural\" + 0.007*\"fund\" + 0.006*\"drum\" + 0.006*\"council\" + 0.006*\"plan\" + 0.005*\"budget\" + 0.005*\"say\" + 0.005*\"chang\"\n",
      "\n",
      "Score: 0.02501242505708388\t \n",
      "Topic8: 0.024*\"countri\" + 0.022*\"hour\" + 0.011*\"market\" + 0.011*\"turnbul\" + 0.008*\"weather\" + 0.008*\"share\" + 0.007*\"live\" + 0.006*\"price\" + 0.005*\"kid\" + 0.005*\"wall\"\n",
      "\n",
      "Score: 0.025005198952628434\t \n",
      "Topic2: 0.010*\"leagu\" + 0.008*\"hill\" + 0.008*\"australia\" + 0.007*\"world\" + 0.006*\"rugbi\" + 0.006*\"octob\" + 0.006*\"novemb\" + 0.006*\"john\" + 0.006*\"peter\" + 0.006*\"tuesday\"\n",
      "\n",
      "Score: 0.025002765044766295\t \n",
      "Topic4: 0.007*\"juli\" + 0.007*\"australia\" + 0.006*\"south\" + 0.006*\"korea\" + 0.006*\"june\" + 0.005*\"fiji\" + 0.005*\"china\" + 0.004*\"north\" + 0.004*\"russia\" + 0.004*\"protest\"\n",
      "\n",
      "Score: 0.02500241208928745\t \n",
      "Topic7: 0.024*\"trump\" + 0.010*\"grandstand\" + 0.006*\"histori\" + 0.006*\"capit\" + 0.006*\"quiz\" + 0.005*\"spring\" + 0.005*\"memori\" + 0.004*\"bendigo\" + 0.004*\"chris\" + 0.004*\"drone\"\n",
      "\n",
      "Score: 0.02500239368623071\t \n",
      "Topic5: 0.007*\"asylum\" + 0.006*\"seeker\" + 0.006*\"island\" + 0.006*\"dairi\" + 0.004*\"firefight\" + 0.004*\"energi\" + 0.004*\"creek\" + 0.004*\"volunt\" + 0.004*\"ship\" + 0.004*\"interview\"\n",
      "\n",
      "Score: 0.025002148540928876\t \n",
      "Topic9: 0.014*\"coast\" + 0.010*\"podcast\" + 0.009*\"donald\" + 0.009*\"queensland\" + 0.008*\"gold\" + 0.008*\"north\" + 0.007*\"sport\" + 0.006*\"west\" + 0.006*\"rain\" + 0.006*\"south\"\n",
      "\n",
      "Score: 0.025002102978203173\t \n",
      "Topic6: 0.017*\"news\" + 0.008*\"marriag\" + 0.008*\"michael\" + 0.007*\"david\" + 0.006*\"toni\" + 0.006*\"rural\" + 0.006*\"mount\" + 0.006*\"busi\" + 0.006*\"jam\" + 0.006*\"nation\"\n",
      "\n",
      "Score: 0.025001804591399755\t \n",
      "Topic1: 0.017*\"polic\" + 0.017*\"charg\" + 0.014*\"murder\" + 0.013*\"crash\" + 0.012*\"woman\" + 0.010*\"court\" + 0.010*\"alleg\" + 0.010*\"jail\" + 0.010*\"death\" + 0.009*\"shoot\"\n",
      "\n",
      "Score: 0.025001546548392962\t \n",
      "Topic3: 0.014*\"interview\" + 0.007*\"final\" + 0.007*\"open\" + 0.007*\"septemb\" + 0.007*\"monday\" + 0.006*\"thursday\" + 0.006*\"climat\" + 0.005*\"miss\" + 0.005*\"april\" + 0.005*\"shark\"\n"
     ]
    }
   ],
   "source": [
    "print(documents['headline_text'][docId])\n",
    "print(processed_docs[docId])\n",
    "print('-------------------------------------')\n",
    "# lda_model_tfidf was trained on TF-IDF weighted vectors, so the query document has to go\n",
    "# through the same tfidf transform first; the original passed raw bag-of-words counts.\n",
    "doc_tfidf = tfidf[bow_corpus[docId]]\n",
    "for index, score in sorted(lda_model_tfidf[doc_tfidf], key=lambda tup: -1*tup[1]):\n",
    "    print(\"\\nScore: {}\\t \\nTopic{}: {}\".format(score,index, lda_model_tfidf.print_topic(index, 10)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Testing model on unseen document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 0.3500005565346774\t Topic: 0.054*\"polic\" + 0.031*\"sydney\" + 0.020*\"crash\" + 0.020*\"perth\" + 0.018*\"die\"\n",
      "Score: 0.1833332196787608\t Topic: 0.016*\"council\" + 0.014*\"plan\" + 0.013*\"tasmanian\" + 0.012*\"turnbul\" + 0.011*\"work\"\n",
      "Score: 0.18333311398038923\t Topic: 0.033*\"queensland\" + 0.027*\"adelaid\" + 0.024*\"home\" + 0.023*\"open\" + 0.018*\"donald\"\n",
      "Score: 0.18333310974357916\t Topic: 0.020*\"test\" + 0.019*\"countri\" + 0.016*\"attack\" + 0.014*\"claim\" + 0.013*\"kill\"\n",
      "Score: 0.016666666678315654\t Topic: 0.029*\"melbourn\" + 0.011*\"hold\" + 0.011*\"game\" + 0.009*\"turn\" + 0.008*\"go\"\n",
      "Score: 0.016666666677291653\t Topic: 0.033*\"trump\" + 0.031*\"charg\" + 0.030*\"court\" + 0.023*\"murder\" + 0.019*\"face\"\n",
      "Score: 0.016666666677209844\t Topic: 0.026*\"year\" + 0.019*\"market\" + 0.017*\"live\" + 0.017*\"australian\" + 0.015*\"rise\"\n",
      "Score: 0.01666666667708792\t Topic: 0.022*\"govern\" + 0.017*\"elect\" + 0.017*\"interview\" + 0.014*\"say\" + 0.013*\"labor\"\n",
      "Score: 0.016666666676871875\t Topic: 0.022*\"nation\" + 0.021*\"report\" + 0.016*\"rural\" + 0.013*\"health\" + 0.012*\"concern\"\n",
      "Score: 0.016666666675816455\t Topic: 0.049*\"australia\" + 0.020*\"south\" + 0.019*\"north\" + 0.018*\"coast\" + 0.015*\"final\"\n",
      "---------------------------------------------------------------------------------------------\n",
      "Score: 0.3776343720481849\t Topic: 0.029*\"melbourn\" + 0.011*\"hold\" + 0.011*\"game\" + 0.009*\"turn\" + 0.008*\"go\"\n",
      "Score: 0.3185531204641723\t Topic: 0.026*\"year\" + 0.019*\"market\" + 0.017*\"live\" + 0.017*\"australian\" + 0.015*\"rise\"\n",
      "Score: 0.18714385614699203\t Topic: 0.033*\"trump\" + 0.031*\"charg\" + 0.030*\"court\" + 0.023*\"murder\" + 0.019*\"face\"\n",
      "Score: 0.01666736283559552\t Topic: 0.022*\"nation\" + 0.021*\"report\" + 0.016*\"rural\" + 0.013*\"health\" + 0.012*\"concern\"\n",
      "Score: 0.016667093359474527\t Topic: 0.054*\"polic\" + 0.031*\"sydney\" + 0.020*\"crash\" + 0.020*\"perth\" + 0.018*\"die\"\n",
      "Score: 0.016666945816529572\t Topic: 0.049*\"australia\" + 0.020*\"south\" + 0.019*\"north\" + 0.018*\"coast\" + 0.015*\"final\"\n",
      "Score: 0.01666691727019073\t Topic: 0.016*\"council\" + 0.014*\"plan\" + 0.013*\"tasmanian\" + 0.012*\"turnbul\" + 0.011*\"work\"\n",
      "Score: 0.016666855112910793\t Topic: 0.020*\"test\" + 0.019*\"countri\" + 0.016*\"attack\" + 0.014*\"claim\" + 0.013*\"kill\"\n",
      "Score: 0.016666757511489303\t Topic: 0.022*\"govern\" + 0.017*\"elect\" + 0.017*\"interview\" + 0.014*\"say\" + 0.013*\"labor\"\n",
      "Score: 0.016666719434460256\t Topic: 0.033*\"queensland\" + 0.027*\"adelaid\" + 0.024*\"home\" + 0.023*\"open\" + 0.018*\"donald\"\n"
     ]
    }
   ],
   "source": [
    "unseen_document = \"Five bystanders shot during police shootout in New Orleans\"\n",
    "bow_vector = dictionary.doc2bow(preprocess(unseen_document))\n",
    "\n",
    "# Topic scores from the bag-of-words LDA model, highest first.\n",
    "for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):\n",
    "    print(\"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 5)))\n",
    "print('---------------------------------------------------------------------------------------------')\n",
    "# Topic scores from the TF-IDF LDA model. Two fixes versus the original loop:\n",
    "#  * the topic text is taken from lda_model_tfidf; the original called\n",
    "#    lda_model.print_topic and so labelled TF-IDF scores with the other model's topics;\n",
    "#  * the query is TF-IDF weighted first, matching the vectors the model was trained on.\n",
    "tfidf_vector = tfidf[bow_vector]\n",
    "for index, score in sorted(lda_model_tfidf[tfidf_vector], key=lambda tup: -1*tup[1]):\n",
    "    print(\"Score: {}\\t Topic: {}\".format(score, lda_model_tfidf.print_topic(index, 5)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
